import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Load the life-expectancy dataset; the path is relative to the current working directory.
df = pd.read_csv("Life-Expectancy-Data-Updated.csv")
# Preview the first rows to sanity-check the columns that were read.
df.head()
| Country | Region | Year | Infant_deaths | Under_five_deaths | Adult_mortality | Alcohol_consumption | Hepatitis_B | Measles | BMI | ... | Diphtheria | Incidents_HIV | GDP_per_capita | Population_mln | Thinness_ten_nineteen_years | Thinness_five_nine_years | Schooling | Economy_status_Developed | Economy_status_Developing | Life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Turkiye | Middle East | 2015 | 11.1 | 13.0 | 105.8240 | 1.32 | 97 | 65 | 27.8 | ... | 97 | 0.08 | 11006 | 78.53 | 4.9 | 4.8 | 7.8 | 0 | 1 | 76.5 |
| 1 | Spain | European Union | 2015 | 2.7 | 3.3 | 57.9025 | 10.35 | 97 | 94 | 26.0 | ... | 97 | 0.09 | 25742 | 46.44 | 0.6 | 0.5 | 9.7 | 1 | 0 | 82.8 |
| 2 | India | Asia | 2007 | 51.5 | 67.9 | 201.0765 | 1.57 | 60 | 35 | 21.2 | ... | 64 | 0.13 | 1076 | 1183.21 | 27.1 | 28.0 | 5.0 | 0 | 1 | 65.4 |
| 3 | Guyana | South America | 2006 | 32.8 | 40.5 | 222.1965 | 5.68 | 93 | 74 | 25.3 | ... | 93 | 0.79 | 4146 | 0.75 | 5.7 | 5.5 | 7.9 | 0 | 1 | 67.0 |
| 4 | Israel | Middle East | 2012 | 3.4 | 4.3 | 57.9510 | 2.89 | 97 | 89 | 27.0 | ... | 94 | 0.08 | 33995 | 7.91 | 1.2 | 1.1 | 12.8 | 1 | 0 | 81.7 |
5 rows × 21 columns
# Summarise column dtypes, non-null counts and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2864 entries, 0 to 2863 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 2864 non-null object 1 Region 2864 non-null object 2 Year 2864 non-null int64 3 Infant_deaths 2864 non-null float64 4 Under_five_deaths 2864 non-null float64 5 Adult_mortality 2864 non-null float64 6 Alcohol_consumption 2864 non-null float64 7 Hepatitis_B 2864 non-null int64 8 Measles 2864 non-null int64 9 BMI 2864 non-null float64 10 Polio 2864 non-null int64 11 Diphtheria 2864 non-null int64 12 Incidents_HIV 2864 non-null float64 13 GDP_per_capita 2864 non-null int64 14 Population_mln 2864 non-null float64 15 Thinness_ten_nineteen_years 2864 non-null float64 16 Thinness_five_nine_years 2864 non-null float64 17 Schooling 2864 non-null float64 18 Economy_status_Developed 2864 non-null int64 19 Economy_status_Developing 2864 non-null int64 20 Life_expectancy 2864 non-null float64 dtypes: float64(11), int64(8), object(2) memory usage: 470.0+ KB
# Count the missing values in each column.
df.isna().sum()
Country 0 Region 0 Year 0 Infant_deaths 0 Under_five_deaths 0 Adult_mortality 0 Alcohol_consumption 0 Hepatitis_B 0 Measles 0 BMI 0 Polio 0 Diphtheria 0 Incidents_HIV 0 GDP_per_capita 0 Population_mln 0 Thinness_ten_nineteen_years 0 Thinness_five_nine_years 0 Schooling 0 Economy_status_Developed 0 Economy_status_Developing 0 Life_expectancy 0 dtype: int64
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| Year | Infant_deaths | Under_five_deaths | Adult_mortality | Alcohol_consumption | Hepatitis_B | Measles | BMI | Polio | Diphtheria | Incidents_HIV | GDP_per_capita | Population_mln | Thinness_ten_nineteen_years | Thinness_five_nine_years | Schooling | Economy_status_Developed | Economy_status_Developing | Life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 | 2864.000000 |
| mean | 2007.500000 | 30.363792 | 42.938268 | 192.251775 | 4.820882 | 84.292598 | 77.344972 | 25.032926 | 86.499651 | 86.271648 | 0.894288 | 11540.924930 | 36.675915 | 4.865852 | 4.899825 | 7.632123 | 0.206704 | 0.793296 | 68.856075 |
| std | 4.610577 | 27.538117 | 44.569974 | 114.910281 | 3.981949 | 15.995511 | 18.659693 | 2.193905 | 15.080365 | 15.534225 | 2.381389 | 16934.788931 | 136.485867 | 4.438234 | 4.525217 | 3.171556 | 0.405012 | 0.405012 | 9.405608 |
| min | 2000.000000 | 1.800000 | 2.300000 | 49.384000 | 0.000000 | 12.000000 | 10.000000 | 19.800000 | 8.000000 | 16.000000 | 0.010000 | 148.000000 | 0.080000 | 0.100000 | 0.100000 | 1.100000 | 0.000000 | 0.000000 | 39.400000 |
| 25% | 2003.750000 | 8.100000 | 9.675000 | 106.910250 | 1.200000 | 78.000000 | 64.000000 | 23.200000 | 81.000000 | 81.000000 | 0.080000 | 1415.750000 | 2.097500 | 1.600000 | 1.600000 | 5.100000 | 0.000000 | 1.000000 | 62.700000 |
| 50% | 2007.500000 | 19.600000 | 23.100000 | 163.841500 | 4.020000 | 89.000000 | 83.000000 | 25.500000 | 93.000000 | 93.000000 | 0.150000 | 4217.000000 | 7.850000 | 3.300000 | 3.400000 | 7.800000 | 0.000000 | 1.000000 | 71.400000 |
| 75% | 2011.250000 | 47.350000 | 66.000000 | 246.791375 | 7.777500 | 96.000000 | 93.000000 | 26.400000 | 97.000000 | 97.000000 | 0.460000 | 12557.000000 | 23.687500 | 7.200000 | 7.300000 | 10.300000 | 0.000000 | 1.000000 | 75.400000 |
| max | 2015.000000 | 138.100000 | 224.900000 | 719.360500 | 17.870000 | 99.000000 | 99.000000 | 32.100000 | 99.000000 | 99.000000 | 21.680000 | 112418.000000 | 1379.860000 | 27.700000 | 28.600000 | 14.100000 | 1.000000 | 1.000000 | 83.800000 |
# Summary of the object-dtype columns: count, distinct values, most frequent value and its frequency.
df.describe(include=object)
| Country | Region | |
|---|---|---|
| count | 2864 | 2864 |
| unique | 179 | 9 |
| top | Turkiye | Africa |
| freq | 16 | 816 |
# Number of distinct values in each column.
df.nunique()
Country 179 Region 9 Year 16 Infant_deaths 847 Under_five_deaths 1035 Adult_mortality 2850 Alcohol_consumption 1164 Hepatitis_B 80 Measles 87 BMI 120 Polio 77 Diphtheria 80 Incidents_HIV 393 GDP_per_capita 2564 Population_mln 1803 Thinness_ten_nineteen_years 200 Thinness_five_nine_years 207 Schooling 130 Economy_status_Developed 2 Economy_status_Developing 2 Life_expectancy 396 dtype: int64
# Box plot of life expectancy per region, with each region's mean overlaid as a white circle.
fig, ax = plt.subplots(figsize=(8, 6))

# Fix the region order once so that the boxes, their colours and the legend entries all follow
# the same sequence instead of each call deriving it on its own.
regions = list(df['Region'].unique())
region_colors = dict(zip(regions, sns.color_palette('bright', n_colors=len(regions))))

sns.boxplot(
    data=df,
    x='Region',
    y='Life_expectancy',
    order=regions,
    palette=region_colors,
    ax=ax,
    showmeans=True,
    meanprops={'marker': 'o', 'markerfacecolor': 'white', 'markeredgecolor': 'black'},
)

# Rotate the region names so that the long labels do not overlap.
ax.tick_params(axis='x', labelrotation=90)
ax.set_xlabel(None)
ax.set_ylabel("Life expectancy")
ax.set_title("Life expectancy by region")

# seaborn fills the boxes with its default saturation of 0.75; desaturate the legend colours by
# the same amount so that every legend entry matches the box it describes.
legend_elements = [
    plt.Line2D([0], [0], color=sns.desaturate(region_colors[region], 0.75), lw=2, label=region)
    for region in regions
]
# Anchor the legend outside the axes so that it does not cover any box.
ax.legend(handles=legend_elements, title="Regions", loc="upper right", bbox_to_anchor=(1.5, 1))
plt.show()
# Violin plots of the life-expectancy distribution in each region (wide figure: aspect=2).
violin_grid = sns.catplot(
    kind='violin',
    data=df,
    x='Region',
    y='Life_expectancy',
    aspect=2,
)
# Rotate the region names so that the long labels do not overlap.
plt.xticks(rotation=90)
plt.show()
The above box plots (or box-and-whisker plots) are a graphical representation of my dataset's summary statistics.
The box plot above shows the distribution of life expectancy in each region.
Middle East, South America, Central America, and Oceania have outliers. Africa has the widest and the lowest range of life expectancy, whereas North America has the narrowest and the highest range of life expectancy compared to the other regions.
About the boxplots:
1. Box: The central part of the plot is a rectangular box. The box represents the interquartile range (IQR), which includes the middle 50% of the data. The bottom and top of the box mark the first quartile (Q1) and third quartile (Q3), respectively. The height of the box shows the spread of the data within this middle 50%.
2. Median: A horizontal line inside the box represents the median (Q2) of the data, which is the middle value when the data is sorted. The white circle inside each box marks the mean.
3. Whiskers: Lines extending from the top and bottom of the box indicate the range of the non-outlier data. `whis` is left at its default of 1.5, so each whisker reaches the most extreme data point that lies within 1.5 × IQR of the box.
4. Outliers: Any data points outside the whiskers are considered outliers and are usually plotted as individual points. Outliers are values that are significantly different from the rest of the data and can provide insights into data anomalies.
# Annotated heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=(15, 10))
# Country and Region hold strings, so restrict the frame to numeric columns before calling
# corr(): relying on corr() to drop them implicitly is deprecated and fails on newer pandas.
numeric_columns = df.select_dtypes(include='number')
sns.heatmap(data=numeric_columns.corr(), annot=True)
plt.show()
/var/folders/27/m6jr_n_d2tx7504nd_ywk5b40000gn/T/ipykernel_20757/3381459498.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(data=df.corr(), annot=True)
# Mean life expectancy over all rows of each year.
average_timeline = df.groupby('Year')['Life_expectancy'].mean()

# Median life expectancy for the years 2000 and 2015, indexed by (Year, Region, Country).
endpoint_year_mask = df['Year'].isin([2000, 2015])
average_compare = (
    df[endpoint_year_mask]
    .groupby(['Year', 'Region', 'Country'])['Life_expectancy']
    .median()
)

# Plot the yearly means; the Series index (Year) becomes the x-axis.
sns.lineplot(data=average_timeline)
plt.title('Average life expectancy around the world, since 2000-2015.')
plt.xlabel('Years')
plt.ylabel('Life expectancy')
plt.show()
# Life expectancy over time for three selected regions, one line per region.
selected_regions = ['Africa', 'Asia', 'North America']
sns.lineplot(
    data=df[df['Region'].isin(selected_regions)],
    x='Year',
    y='Life_expectancy',
    hue='Region',
)
plt.xlabel('Years')
plt.ylabel('Life expectancy')
# The title names the plotted regions; the figure does not cover the whole world.
plt.title('Average life expectancy in Africa, Asia and North America, 2000-2015.')
plt.show()
# Density of life expectancy in 2000 versus 2015, with each year's median marked by a dashed line.
endpoint_rows = df[df['Year'].isin([2000, 2015])]
median_2000 = df[df['Year'] == 2000]['Life_expectancy'].median()
median_2015 = df[df['Year'] == 2015]['Life_expectancy'].median()

sns.kdeplot(
    data=endpoint_rows,
    x='Life_expectancy',
    hue='Year',
    fill=True,
    palette=sns.color_palette('BrBG', 2),
)

# Dashed vertical markers at the two medians.
plt.axvline(x=median_2000, linestyle='--', color='tan')
plt.axvline(x=median_2015, linestyle='--', color='darkcyan')

# Labels sit 2 units to the right of their line, at different heights so they do not collide.
plt.text(median_2000 + 2, 0.015, 'Median life expectancy at year 2000', color='tan')
plt.text(median_2015 + 2, 0.020, 'Median life expectancy at year 2015', color='darkcyan')

plt.xlabel('Life expectancy')
plt.title('Average life expectancy of people around the world comparison between 2000 and 2015.')
plt.show()
# Point plot of life expectancy per region for 2000 and 2015, using np.median as the estimator.
endpoint_rows = df[df['Year'].isin([2000, 2015])]
year_palette = sns.color_palette('BrBG', 2)
sns.pointplot(
    data=endpoint_rows,
    x='Region',
    y='Life_expectancy',
    hue='Year',
    estimator=np.median,
    palette=year_palette,
)
plt.title('Average life expectancy of population in each region comparison between 2000 and 2015')
plt.xlabel('Regions')
plt.ylabel('Life expectancy')
# Rotate the region names so that the long labels do not overlap.
plt.xticks(rotation=90)
plt.show()
# Bar chart of Infant_deaths for 2000 and 2015, with one bar per region inside each year.
endpoint_rows = df[df['Year'].isin([2000, 2015])]
sns.barplot(
    data=endpoint_rows,
    x='Year',
    y='Infant_deaths',
    hue='Region',
)
plt.ylabel('Infant death rates')
# Place the legend outside the axes, to the right of the bars.
plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
plt.show()
# Independent copy holding only schooling and the two economy-status indicator columns.
economy_columns = ['Schooling', 'Economy_status_Developed', 'Economy_status_Developing']
df_copy3 = df[economy_columns].copy()
df_copy3.head()
| Schooling | Economy_status_Developed | Economy_status_Developing | |
|---|---|---|---|
| 0 | 7.8 | 0 | 1 |
| 1 | 9.7 | 1 | 0 |
| 2 | 5.0 | 0 | 1 |
| 3 | 7.9 | 0 | 1 |
| 4 | 12.8 | 1 | 0 |
# Box plots of life expectancy per region, split by the Economy_status_Developed flag.
status_grid = sns.catplot(
    kind='box',
    data=df,
    x='Region',
    y='Life_expectancy',
    hue='Economy_status_Developed',
    aspect=2,
)
# Rotate the region names so that the long labels do not overlap.
plt.xticks(rotation=90)
plt.show()
# Independent copy of df without the columns listed below; what remains is the country and
# region labels, the child-death and immunisation columns, and Life_expectancy.
columns_to_drop = [
    'Year',
    'Adult_mortality',
    'Alcohol_consumption',
    'Incidents_HIV',
    'BMI',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    'Economy_status_Developed',
    'Economy_status_Developing',
]
df_copy1 = df.drop(columns=columns_to_drop).copy()
df_copy1.head()
| Country | Region | Infant_deaths | Under_five_deaths | Hepatitis_B | Measles | Polio | Diphtheria | Life_expectancy | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | Turkiye | Middle East | 11.1 | 13.0 | 97 | 65 | 97 | 97 | 76.5 |
| 1 | Spain | European Union | 2.7 | 3.3 | 97 | 94 | 97 | 97 | 82.8 |
| 2 | India | Asia | 51.5 | 67.9 | 60 | 35 | 67 | 64 | 65.4 |
| 3 | Guyana | South America | 32.8 | 40.5 | 93 | 74 | 92 | 93 | 67.0 |
| 4 | Israel | Middle East | 3.4 | 4.3 | 97 | 89 | 94 | 94 | 81.7 |
# Grid of pairwise plots for the numeric columns kept in df_copy1.
diagonal_options = {'common_norm': False}
sns.pairplot(df_copy1, diag_kws=diagonal_options)
plt.show()
# Independent copy of df without the columns listed below; what remains is the country and
# region labels, Adult_mortality, Alcohol_consumption, BMI, Incidents_HIV and Life_expectancy.
columns_to_drop = [
    'Year',
    'Infant_deaths',
    'Hepatitis_B',
    'Under_five_deaths',
    'Diphtheria',
    'Measles',
    'Polio',
    'GDP_per_capita',
    'Population_mln',
    'Thinness_ten_nineteen_years',
    'Thinness_five_nine_years',
    'Schooling',
    'Economy_status_Developed',
    'Economy_status_Developing',
]
df_copy2 = df.drop(columns=columns_to_drop).copy()
df_copy2.head()
| Country | Region | Adult_mortality | Alcohol_consumption | BMI | Incidents_HIV | Life_expectancy | |
|---|---|---|---|---|---|---|---|
| 0 | Turkiye | Middle East | 105.8240 | 1.32 | 27.8 | 0.08 | 76.5 |
| 1 | Spain | European Union | 57.9025 | 10.35 | 26.0 | 0.09 | 82.8 |
| 2 | India | Asia | 201.0765 | 1.57 | 21.2 | 0.13 | 65.4 |
| 3 | Guyana | South America | 222.1965 | 5.68 | 25.3 | 0.79 | 67.0 |
| 4 | Israel | Middle East | 57.9510 | 2.89 | 27.0 | 0.08 | 81.7 |
# Pairwise plots of the adult-health columns, coloured by life-expectancy band.
# Using the raw Life_expectancy column as hue would create one hue level per distinct value
# (several hundred of them), which gives an unreadable legend and near-degenerate diagonal
# densities. Binning into quartile bands keeps the low-to-high colour ordering with four levels.
banded = df_copy2.assign(
    Life_expectancy_band=pd.qcut(df_copy2['Life_expectancy'], q=4),
)
sns.pairplot(
    data=banded,
    # List the plotted variables explicitly so that the grid stays limited to these four
    # columns and Life_expectancy itself is only used for colouring.
    vars=['Adult_mortality', 'Alcohol_consumption', 'BMI', 'Incidents_HIV'],
    hue='Life_expectancy_band',
    diag_kws={'common_norm': False},
    palette='viridis',
)
plt.show()
# Joint KDE of life expectancy against schooling, split by the Economy_status_Developed flag.
marginal_options = dict(common_norm=False)
sns.jointplot(
    kind='kde',
    data=df,
    x='Life_expectancy',
    y='Schooling',
    hue='Economy_status_Developed',
    marginal_kws=marginal_options,
)
plt.show()
# Joint plot of life expectancy against infant deaths, split by the Economy_status_Developed flag.
marginal_options = dict(common_norm=False)
sns.jointplot(
    data=df,
    x='Life_expectancy',
    y='Infant_deaths',
    hue='Economy_status_Developed',
    marginal_kws=marginal_options,
)
plt.show()
# Scatter plots coloured by region, one figure per (x, y) column pair.
# Each pair is listed exactly once, so no figure is rendered twice.
scatter_pairs = [
    ('Infant_deaths', 'Polio'),
    ('Infant_deaths', 'Diphtheria'),
    ('Adult_mortality', 'BMI'),
]
for x_column, y_column in scatter_pairs:
    sns.scatterplot(data=df, x=x_column, y=y_column, hue='Region')
    # Place the legend outside the axes so that it does not hide any points.
    plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
    plt.show()
# Point plot of GDP per capita for each region in 2015, using np.median as the estimator.
# Only a single year is plotted, so there is nothing to split by hue; the year is stated in the
# title instead of in a one-entry legend.
sns.pointplot(
    data=df[df['Year'] == 2015],
    x='Region',
    y='GDP_per_capita',
    estimator=np.median,
)
plt.xlabel('Regions')
plt.ylabel('GDP Per Capita')
# Rotate the region names so that the long labels do not overlap.
plt.xticks(rotation=90)
plt.title('Median GDP per capita of each region in 2015')
plt.show()